This notebook is my practice of EDA (exploratory data analysis)

I used to be interested in the daily Covid data when Covid hit Indonesia back in March and April. It has been a while, so let's have a look at the data now

Data will be scraped from:

Summary of this notebook:

  • Download Covid19 data for worldwide and Indonesia from several sources
  • Download GEOjson map for Indonesia
  • Worldwide: summary of latest data, worldwide map, death vs confirmed comparison
  • Indonesia: summary of latest data, summary plots (total cases, daily cases, positive rate and mortality rate) and other random stats that I am interested in

New skills I picked up and applied on this notebook:

  • First time using Git properly
  • Using Plotly Express
  • Extracting data from Google Sheet API
  • Cleaning data. The spreadsheet is messy. Table are stacked on other tables in the same spreadsheet tab
  • Extracted data is string. Not sure if there is a way to extract in a numeric format instead of converting it to float manually. For next time, maybe there is a way to just download from Google Sheet automatically and just pd.read_csv()
  • Working with GEOjson data format and plotting an interactive map

Import necessary python libraries

# Core python libraries used throughout the notebook.
from datetime import datetime, timedelta
import os
import glob
import wget
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
import json
import plotly.express as px
import plotly.graph_objs as go

# Plotly offline mode so figures render without a plotly account/server.
from plotly.offline import plot, iplot, init_notebook_mode
init_notebook_mode(connected=True)

# Used to embed exported plotly figures inline in the notebook.
from IPython.display import HTML

Import data

# Download the latest JHU CSSE time-series files from GitHub (run daily).
os.chdir("C:/Users/Riyan Aditya/Desktop/ML_learning/Project4_EDA_Covid_Indo/datasets")

# One base URL + three filenames instead of three long duplicated URLs.
BASE_URL = ('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/'
            'csse_covid_19_data/csse_covid_19_time_series/')
FILENAMES = ['time_series_covid19_confirmed_global.csv',
             'time_series_covid19_deaths_global.csv',
             'time_series_covid19_recovered_global.csv']

# Remove yesterday's copies first so wget does not save duplicates.
# Guarded with exists(): the unconditional os.remove calls raised
# FileNotFoundError on a fresh directory.
for name in FILENAMES:
    if os.path.exists(name):
        os.remove(name)

# urls of the files
urls = [BASE_URL + name for name in FILENAMES]

# download files
for url in urls:
    filename = wget.download(url)

100% [............................................................................] 265849 / 265849

Clean & preprocess data

# Load the three time-series CSVs into dataframes.
confirmed_global = pd.read_csv('time_series_covid19_confirmed_global.csv')
deaths_global = pd.read_csv('time_series_covid19_deaths_global.csv')
recovered_global = pd.read_csv('time_series_covid19_recovered_global.csv')

# Every column after the first four is a date; melt the wide layout into
# long form so each (place, date) pair becomes a single row.
dates = confirmed_global.columns[4:]
_ID_COLS = ['Province/State', 'Country/Region', 'Lat', 'Long']


def _to_long(wide_df, value_name):
    # Wide (one column per date) -> long (one row per date).
    return wide_df.melt(id_vars=_ID_COLS, value_vars=dates,
                        var_name='Date', value_name=value_name)


confirmed_globalv2 = _to_long(confirmed_global, 'Confirmed')
deaths_globalv2 = _to_long(deaths_global, 'Deaths')
recovered_globalv2 = _to_long(recovered_global, 'Recovered')

print(confirmed_globalv2.shape)
print(deaths_globalv2.shape)
print(recovered_globalv2.shape)

(70755, 6)
(70755, 6)
(67310, 6)

Why are there differences in number of rows between confirmed (or death) & recovered?

This seems to suggest some countries are missing their data

# Merge the three long tables into one frame keyed by place and date.
_KEYS = ['Province/State', 'Country/Region', 'Lat', 'Long', 'Date']

covid_global = confirmed_globalv2.merge(deaths_globalv2, how='left', on=_KEYS)
covid_global = covid_global.merge(recovered_globalv2, how='left', on=_KEYS)

# Parse the date strings once, up front.
covid_global['Date'] = pd.to_datetime(covid_global['Date'])

# Active = still sick: confirmed minus resolved (deaths + recoveries).
covid_global['Active'] = (covid_global['Confirmed']
                          - covid_global['Deaths']
                          - covid_global['Recovered'])

Group data by day

# Data by day: worldwide totals per date.
# FIX: multi-column selection after groupby needs a list (double brackets);
# the old comma-separated single-bracket form was deprecated and removed
# in pandas >= 2.0.
covid_global_daily = covid_global.groupby('Date')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()

Group data by country

# Data by country: keep only the most recent date, then sum over provinces.
temp = covid_global[covid_global['Date'] ==max(covid_global['Date'])].reset_index(drop=True).drop('Date', axis = 1)
# FIX: double-bracket column list — the tuple form was removed in pandas >= 2.0.
covid_global_percountry = temp.groupby('Country/Region')[['Confirmed', 'Deaths', 'Recovered', 'Active']].sum().reset_index()

Worldwide Data Viz

Latest data

Show latest data

# Print the most recent worldwide snapshot (last row of the daily table).
latest = covid_global_daily.iloc[-1]
print('Date today', latest['Date'])
print('Total cases', '{:,}'.format(latest['Confirmed']))
print('Active cases', '{:,}'.format(latest['Active']))
print('Recovered cases', '{:,}'.format(latest['Recovered']))
print('Deaths cases', '{:,}'.format(latest['Deaths']))

Date today 2020-10-12 00:00:00
Total cases 37,801,526
Active cases 9,224,575.0
Recovered cases 26,108,249.0
Deaths cases 997,143.0

We have almost reached 1M global deaths =(

# plot: treemap of the latest day's Active / Deaths / Recovered split
temp = covid_global_daily[['Date','Deaths','Recovered','Active']].tail(1)
# One row -> long form so each status becomes its own treemap tile.
temp = temp.melt(id_vars='Date',value_vars = ['Active','Deaths','Recovered'])
fig = px.treemap(temp, path=['variable'],values = 'value', height = 225)
# Show the label together with the value on each tile.
fig.data[0].textinfo = 'label+text+value'

# Embed the figure as HTML so it survives notebook export.
HTML(fig.to_html(include_plotlyjs='cdn'))

Total confirmed cases world map

World map interactive plot

def plot_map(df, col, pal):
    """World choropleth of `col`, one colour per country.

    Rows where `col` is zero (or negative) are dropped so those
    countries render blank instead of taking the lowest colour.
    """
    nonzero = df[df[col] > 0]
    fig2 = px.choropleth(
        nonzero,
        locations="Country/Region",
        locationmode='country names',
        color=col,
        hover_name="Country/Region",
        title=col,
        hover_data=[col],
        color_continuous_scale=pal,
    )
    # The hover text already carries the numbers; hide the colour bar.
    fig2.update_layout(coloraxis_showscale=False)
    return fig2


fig2 = plot_map(covid_global_percountry, 'Confirmed', 'matter')
HTML(fig2.to_html(include_plotlyjs='cdn'))

Treemap total confirmed cases

def plot_treemap(df, col):
    """Treemap of `col` with one tile per country."""
    fig3 = px.treemap(
        df,
        path=["Country/Region"],
        values=col,
        height=700,
        title=col,
        color_discrete_sequence=px.colors.qualitative.Dark2,
    )
    # Show country name and value together on each tile.
    fig3.data[0].textinfo = 'label+text+value'
    return fig3


fig3 = plot_treemap(covid_global_percountry, 'Confirmed')
HTML(fig3.to_html(include_plotlyjs='cdn'))

Death vs confirmed

For top 50 countries with the highest total cases

def human_format(num):
    """Format a number compactly: 1500 -> '1.50K', 2e6 -> '2.00M'."""
    suffixes = ['', 'K', 'M', 'G', 'T', 'P']
    tier = 0
    # Divide down by thousands until at most three integer digits remain.
    while abs(num) >= 1000:
        num /= 1000.0
        tier += 1
    return f'{num:.2f}{suffixes[tier]}'

# plot: deaths vs confirmed for the 50 countries with the most deaths,
# on a log-log scale; bubble size tracks confirmed cases.
fig4 = px.scatter(covid_global_percountry.sort_values('Deaths', ascending=False).iloc[:50, :], 
                 x='Confirmed', y='Deaths', color='Country/Region', size='Confirmed', 
                 height=700, text='Country/Region', log_x=True, log_y=True, 
                 title='Deaths vs Confirmed (Scale is in log10)',
                hover_data={'Country/Region':True,'Confirmed':':,','Deaths':':,'})
fig4.update_traces(textposition='top center')
# Each point is already labelled with its country, so hide the legend.
fig4.update_layout(showlegend=False)
HTML(fig4.to_html(include_plotlyjs='cdn'))

Indonesia

Data starts 18 Mar

Import data

# Work out how many days of data exist per table (from each series'
# first recorded date in the spreadsheet up to today), so we know how
# many rows of each stacked table to slice later.
from datetime import date

# First date each series appears in the KawalCOVID spreadsheet.
d0_total = date(2020, 3, 18)
d0_harian = date(2020, 3, 15)
d0_aktif = date(2020, 3, 21)
d0_sembuh = date(2020, 3, 21)
d0_deaths = date(2020, 3, 18)

d1 = date.today()

# Elapsed time per series (timedelta objects; .days gives the row count).
delta_total, delta_harian, delta_aktif, delta_sembuh, delta_deaths = (
    d1 - start for start in (d0_total, d0_harian, d0_aktif, d0_sembuh, d0_deaths)
)

All imported data are strings and need to be converted to numeric. The commas used as thousands separators also had to be removed.

Note: you need "Credentials.json". You can get one by following this example: https://developers.google.com/sheets/api/quickstart/python

Code to import data from KawalCOVID:

# %load GoogleSheet from KawalCovid19
from __future__ import print_function
import pickle
import os.path
from googleapiclient.discovery import build
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request

# If modifying these scopes, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/spreadsheets.readonly']

# Spreadsheet ID of the public KawalCOVID19 Google Sheet.
SAMPLE_SPREADSHEET_ID = '1ma1T9hWbec1pXlwZ89WakRk-OfVUQZsOCFl4FwZxzVw'


def main():
    """Fetch three tables from the KawalCOVID19 Google Sheet.

    Returns:
        df1: raw 'Timeline' tab (several per-province tables stacked
            vertically in one sheet; cleaned later in the notebook).
        covid_id: the 'Statistik Harian' (daily statistics) tab, cleaned
            and converted to a float DataFrame indexed by date.
        df2: raw population / PCR table slice.
    """
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first
    # time.
    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            # Requires 'credentials.json' (see the Sheets API quickstart
            # linked in the notebook text above).
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    service = build('sheets', 'v4', credentials=creds)

    # Call the Sheets API for total kasus ----------------------------------------------------------------
    sheet = service.spreadsheets()
    
    SAMPLE_RANGE_NAME = 'Timeline!A1:AZ'
    print(SAMPLE_RANGE_NAME)
    
    result = sheet.values().get(spreadsheetId=SAMPLE_SPREADSHEET_ID,
                                range=SAMPLE_RANGE_NAME).execute()
    values = result.get('values', [])
    df1 = pd.DataFrame(values)
    
    # Call the Sheets API for Statistik harian  ----------------------------------------------------------------
    sheet = service.spreadsheets()
    SAMPLE_RANGE_NAME = 'Statistik Harian!A1:AL'
    print(SAMPLE_RANGE_NAME)
    
    result = sheet.values().get(spreadsheetId=SAMPLE_SPREADSHEET_ID,
                                range=SAMPLE_RANGE_NAME).execute()
    values = result.get('values', [])
    
    df = pd.DataFrame(values)
    # The first row of the tab holds the column headers.
    headers = df.iloc[0]
    covid_id  = pd.DataFrame(df.values[1:], columns=headers)
    covid_id.columns.values[0] = "Dates"
    # Strip thousands separators; turn dash placeholders into spaces.
    covid_id = covid_id.replace(',','', regex=True).replace('-',' ', regex=True)
    covid_id = covid_id.replace('',0, regex=True)
    # Spreadsheet formula errors are treated as zero.
    covid_id = covid_id.replace('#DIV/0!',0, regex=True)
    covid_id = covid_id.replace('#REF!',0, regex=True)
    covid_id = covid_id.set_index('Dates')
    # NOTE(review): dividing by 100 then multiplying by 100 cancels out; the
    # net effect of these two lines is only stripping '%' and casting to
    # float. Looks redundant — confirm before simplifying.
    covid_id = covid_id.replace('%','',regex=True).astype('float')/100
    covid_id = covid_id.astype('float')*100
    
    # Sheet dates have no year ('18 Mar'); parse, then pin every date to 2020.
    # (DateOffset on an index is non-vectorized — hence the PerformanceWarning.)
    covid_id.index = pd.to_datetime(covid_id.index, format='%d %b')
    covid_id.index = covid_id.index + pd.DateOffset(year=2020)
    
    # Call the Sheets API for Population ----------------------------------------------------------------
    sheet = service.spreadsheets()
    
    SAMPLE_RANGE_NAME = 'Copy of Data PCR - Kemenkes!J2:M36'
    print(SAMPLE_RANGE_NAME)
    
    result = sheet.values().get(spreadsheetId=SAMPLE_SPREADSHEET_ID,
                                range=SAMPLE_RANGE_NAME).execute()
    values = result.get('values', [])
    df2 = pd.DataFrame(values)

    # return DF --------------------------------------------------------------------------------------------------
    return df1, covid_id, df2
    

indo_covid, covid_id, pop_id = main()

Timeline!A1:AZ
Statistik Harian!A1:AL
C:\Users\Riyan Aditya\Anaconda3\lib\site-packages\pandas\core\arrays\datetimes.py:837: PerformanceWarning:

Non-vectorized DateOffset being applied to Series or DatetimeIndex

Copy of Data PCR - Kemenkes!J2:M36

Clean data

# Work on a copy of the raw Timeline tab; drop the last column.
indo_covid2 = indo_covid.copy()
indo_covid2 = indo_covid2.iloc[:, :-1]

# The sheet stacks several tables vertically. Locate where each one
# starts by its header text in the first column.
first_col = indo_covid2.columns[0]


def _header_row(label):
    # Row index of the first cell in column 0 that equals `label`.
    return indo_covid2.loc[indo_covid2[first_col] == label].index[0]


index_tk = _header_row('Total Kasus')
index_ka = _header_row('Kasus Aktif')
index_ks = _header_row('Sembuh')
index_km = _header_row('Meninggal Dunia')

#Split the stacked sheet into total, active, recovered and death tables,
#rename columns from each table's own header row, then index by date.

def _slice_table(start, n_days, header_label):
    """Cut one sub-table out of indo_covid2 beginning at row `start`."""
    block = indo_covid2[start:start + n_days + 1:]
    return (block.rename(columns=indo_covid2.iloc[start])
                 .drop(indo_covid2.index[start])
                 .replace('-', ' ', regex=True)
                 .set_index(header_label))


# FIX: the original used row 0 for total_cases' header/drop instead of
# index_tk — correct only if the 'Total Kasus' table happens to start at
# row 0. Use index_tk for consistency with the other three tables.
total_cases = _slice_table(index_tk, delta_total.days, 'Total Kasus')
active_cases = _slice_table(index_ka, delta_aktif.days, 'Kasus Aktif')
recovered_cases = _slice_table(index_ks, delta_sembuh.days, 'Sembuh')
deaths_cases = _slice_table(index_km, delta_deaths.days, 'Meninggal Dunia')

# clean df
def clean_df(df):
    """Convert a raw string table into a numeric, date-indexed frame.

    - missing cells (None/NaN) and empty strings become 0
    - thousands separators (',') are stripped
    - values are cast to float64
    - the index ('18 Mar' style, no year) is parsed as a 2020 date

    Replaces the original per-column None-mask loop built on the
    deprecated DataFrame.applymap with fillna (same cells, plus NaN),
    uses an exact-match replacement for empty strings instead of an
    empty-pattern regex, and parses the year directly instead of the
    non-vectorized pd.DateOffset (which raised a PerformanceWarning).
    """
    # Missing cells arrive as None from the Sheets API; treat as zero.
    df = df.fillna(0)
    # Strip thousands separators, then zero out truly empty cells.
    df = df.replace(',', '', regex=True).replace('', 0)
    df = df.astype('float64')
    # '18 Mar' + ' 2020' -> full date; all data in this sheet is from 2020.
    df.index = pd.to_datetime(df.index.astype(str) + ' 2020', format='%d %b %Y')

    return df


# Normalise all four tables in one sweep.
total_cases, active_cases, recovered_cases, deaths_cases = (
    clean_df(t) for t in (total_cases, active_cases, recovered_cases, deaths_cases)
)

# Day-over-day deltas: new cases / recoveries / deaths per day.
new_cases = total_cases.diff()
new_recovered = recovered_cases.diff()
new_deaths = deaths_cases.diff()

Latest Indonesian data

# display latest data
# FIX: Series[-1] is label-based on covid_id's date index — the old
# positional fallback is removed in modern pandas. Use .iloc[-1].
print(d1)
print('Total cases','{:,}'.format(covid_id['Total kasus'].iloc[-1]))
print('Active cases','{:,}'.format(covid_id['Kasus aktif'].iloc[-1]))
print('Total recovered','{:,}'.format(covid_id['Sembuh'].iloc[-1]))
print('Total deaths','{:,}'.format(covid_id['Meninggal\nDunia'].iloc[-1]))

2020-10-13
Total cases 336,716.0
Active cases 66,262.0
Total recovered 258,519.0
Total deaths 11,935.0

Total cases plot

Latest Indonesian Covid data:

# codes: cumulative Indonesian case curve
import matplotlib.ticker as mtick
from matplotlib.dates import DateFormatter

fig, ax = plt.subplots(1, 1, figsize=(15,5))

# 'Total kasus' = cumulative total cases.
ax.plot(covid_id.index,covid_id['Total kasus'],'-D',markersize = 3)

plt.yticks(np.arange(0,400001,100000), fontsize=16)
plt.xticks( fontsize=16)
plt.ylabel('Total cases', fontsize=18)

# Thousands separators on the y axis; month abbreviations on the x axis.
ax.get_yaxis().set_major_formatter(mtick.FuncFormatter(lambda x, p: format(int(x), ',')))
myFmt = DateFormatter("%b")
ax.xaxis.set_major_formatter(myFmt)

plt.show()

C:\Users\Riyan Aditya\Anaconda3\lib\site-packages\pandas\plotting\_matplotlib\converter.py:103: FutureWarning:

Using an implicitly registered datetime converter for a matplotlib plotting method. The converter was registered by pandas on import. Future versions of pandas will require you to explicitly register matplotlib converters.

To register the converters:
	>>> from pandas.plotting import register_matplotlib_converters
	>>> register_matplotlib_converters()

Daily cases

# codes: daily new cases (line, left axis) vs daily tests (bars, right axis)

fig, ax = plt.subplots(1, 1, figsize=(15,5))
# 'Kasus baru' = new cases per day.
ln1 = ax.plot(covid_id.index,covid_id['Kasus baru'],'-D',markersize = 3,label = 'Daily cases')
# Second y axis for test volume so both series share the x axis.
ax2 = ax.twinx()
# 'Spesimen' = specimens tested per day.
ln2 = ax2.bar(covid_id.index,covid_id['Spesimen'],alpha = 0.2, color = 'black',label = 'Daily tests')

ax.tick_params(axis='both', which='major', labelsize=16)
ax2.tick_params(axis='both', which='major', labelsize=16)
ax2.set_ylim([0,60000])
# Thousands separators on both y axes.
ax.get_yaxis().set_major_formatter(mtick.FuncFormatter(lambda x, p: format(int(x), ',')))
ax2.get_yaxis().set_major_formatter(mtick.FuncFormatter(lambda x, p: format(int(x), ',')))
ax.set_ylabel('Daily cases',fontsize = 18)
ax2.set_ylabel('Daily tests',fontsize = 18)

# Month abbreviations on the x axis.
myFmt = DateFormatter("%b")
ax.xaxis.set_major_formatter(myFmt)

#legend: one combined figure legend anchored inside the axes.
fig.legend(bbox_to_anchor=(0.2,1), bbox_transform=ax.transAxes, fontsize = 16, frameon=False)


plt.show()

Unsurprisingly, daily cases are highly correlated with number of daily tests

Positive rate

This is a 7 day rolling average of new cases divided by number of people that are tested.

About 16% of the people tested returned a positive Covid test result.

# codes: weekly positive rate with the average daily rate as a reference line
import matplotlib.ticker as mtick
from matplotlib.dates import DateFormatter

fig, ax = plt.subplots(1, 1, figsize=(15,5))

# NOTE(review): the average is taken over the *daily* positive-rate column
# ('Positive rate harian') while the plotted line is the *weekly* series
# ('Positive rate mingguan') — confirm this mix is intentional.
avg_pos_rate = covid_id['Positive rate harian'].mean()
ax.plot(covid_id.index,covid_id['Positive rate mingguan'],'-D',markersize = 3)
# Dashed horizontal reference line at the average rate, with a label.
plt.axhline(y=avg_pos_rate, color='k', linestyle='--', alpha = 0.3)
plt.text(pd.to_datetime("2020-05-01"),avg_pos_rate+0.02,"average positive rate: "+str(round(avg_pos_rate,1))+"%",fontsize = 14)


ax.set_ylim([0,40])
plt.yticks(np.arange(0,41,10), fontsize=16)
plt.xticks( fontsize=16)
plt.ylabel('Weekly positive rate (%)', fontsize=18)

# Integer ticks with thousands separators; month abbreviations on x.
ax.get_yaxis().set_major_formatter(mtick.FuncFormatter(lambda x, p: format(int(x), ',')))
myFmt = DateFormatter("%b")
ax.xaxis.set_major_formatter(myFmt)

plt.show()

Mortality rate

Average mortality rate is 5.5%. However, this is likely skewed by the data collected during March, which seems to indicate that data collection was not yet fully functional. A more accurate figure is perhaps the recent mortality rate, which is around 3.6%.

# codes: case fatality rate over time with two reference lines

#death_rate = df_indo['2020-03-1'::].new_deaths/df_indo['2020-03-1'::].new_cases*100
# 'Tingkat kematian (seluruh kasus)' = mortality rate over all cases.
avg_death_rate = covid_id['Tingkat kematian (seluruh kasus)'].mean()

fig, ax = plt.subplots(1, 1, figsize=(15,5))

ax.plot(covid_id.index, covid_id['Tingkat kematian (seluruh kasus)'] ,'-D',color ='b',markersize = 3,alpha = 0.5)
# Blue dashed line: average over the whole period.
plt.axhline(y=avg_death_rate, color='b', linestyle='--', alpha = 0.3)
plt.text(pd.to_datetime("2020-08-01"),avg_death_rate+0.5,"average mortality rate: "+str(round(avg_death_rate,1))+'%',fontsize = 14,color='blue')
# Green dashed line: hard-coded eyeball estimate of the recent rate.
plt.axhline(y=3.6, color='g', linestyle='--', alpha = 0.3)
plt.text(pd.to_datetime("2020-04-01"),3.2-1.3,"Recent mortality rate?: "+str(3.6)+'%',fontsize = 14,color='g')

plt.yticks(np.arange(0,15.1,5), fontsize=16)
plt.xticks( fontsize=16)
plt.ylabel('Mortality rate', fontsize=18)

# Month abbreviations on the x axis.
myFmt = DateFormatter("%b")
ax.xaxis.set_major_formatter(myFmt)

plt.show()

Interactive Province Map

# Load GEOjson data for Indonesian province boundaries.
with open('IDN_adm_1_province.json') as data_file:
    indo_map = json.load(data_file)

# Province names exactly as they appear in the GeoJSON (used as a
# reference when matching spreadsheet names to map features).
a = [feature['properties']['NAME_1'] for feature in indo_map['features']]

# Latest cumulative cases per province: last row, flipped to long form.
indo_cases = total_cases.tail(1).T.reset_index()
indo_cases.columns = ['Province', 'Total_cases']
# Drop the trailing row — presumably a national total; verify against sheet.
indo_cases = indo_cases[:-1]

# rename Province based on JSON name
# NOTE(review): this hard-coded list assumes the province row order in
# total_cases never changes; if the spreadsheet reorders or adds
# provinces the names will silently mismatch — verify when data updates.
indo_cases['Province'] = ['Aceh','Bali','Banten','Bangka-Belitung','Bengkulu','Yogyakarta','Jakarta Raya','Jambi',
                            'Jawa Barat','Jawa Tengah','Jawa Timur','Kalimantan Barat','Kalimantan Timur',
                             'Kalimantan Tengah','Kalimantan Selatan','Kalimantan Utara','Kepulauan Riau',
                             'Nusa Tenggara Barat','Sumatera Selatan','Sumatera Barat','Sulawesi Utara',
                          'Sumatera Utara','Sulawesi Tenggara','Sulawesi Selatan','Sulawesi Tengah','Lampung',
                          'Riau','Maluku Utara','Maluku','Irian Jaya Barat','Papua','Sulawesi Barat',
                          'Nusa Tenggara Timur','Gorontalo']

# Latest daily new cases per province, flipped to long form.
indo_cases2 = new_cases.iloc[[-1]].T.reset_index()
indo_cases2.columns = ['Province', 'New_cases']
# Drop the trailing row, mirroring the total-cases frame above.
indo_cases2 = indo_cases2[:-1]

# Attach the new-cases column to the main per-province frame
# (row order matches because both come from the same table layout).
indo_cases['New_cases'] = indo_cases2['New_cases']

# plot map: cumulative cases per province on the Indonesia GeoJSON

fig5 = px.choropleth(indo_cases, geojson=indo_map, locations=indo_cases['Province'],
                    color=indo_cases['Total_cases'], # colour intensity = cumulative cases
                    color_continuous_scale=px.colors.sequential.Reds,featureidkey="properties.NAME_1")


# Zoom to the plotted provinces instead of showing the whole world.
fig5.update_geos(fitbounds="locations")
fig5.update_layout(title = 'Total cases per province')
HTML(fig5.to_html(include_plotlyjs='cdn'))

# plot map: today's new cases per province on the Indonesia GeoJSON

fig6 = px.choropleth(indo_cases, geojson=indo_map, locations=indo_cases['Province'],
                    color=indo_cases['New_cases'], # colour intensity = new cases today
                    color_continuous_scale=px.colors.sequential.Reds,featureidkey="properties.NAME_1")


# Zoom to the plotted provinces instead of showing the whole world.
fig6.update_geos(fitbounds="locations")
fig6.update_layout(title = 'New cases per province ('+str(d1)+')')
fig6.update_layout(coloraxis_colorbar=dict(title='Daily cases'))

HTML(fig6.to_html(include_plotlyjs='cdn'))

Other stats